/*
This do file takes one simulated dataset (Scenario 4 Sample Size 500, medium survival, large heterogeneity)
and shows the analysis approach adopted in the TSD.


This do file fits flexible parametric survival models incorporating background mortality.
*/
// Move to the folder holding the example dataset and the life table.
// Only forward slashes are used in the path: Stata accepts them on every
// platform, whereas a backslash is not a directory separator on macOS/Linux
// (and a backslash can also interfere with macro expansion).
cd "${DRIVE}/GitSoftware/TSD_simulation/Files_for_Appendix/Example"

// stexpect3 calculates marginal expected survival and hazard rates;
// its folder is placed at the front of the ado search path
adopath ++ ../../stexpect3


use sim1, clear

// declare the survival-time data: analysis time is t, failure when d is 1 or 2
stset t,f(d==1,2)
// constant sex indicator; used later in this file as a key when matching
// to the life table
gen sex = 1

// Kaplan-Meier plot
sts graph, ytitle("All-cause survival") xtitle("Time from randomisation") 
// store the Kaplan-Meier estimate so it can be overlaid on the model plots
sts gen S_km = s


// calculate expected survival and hazard functions
// Pad the dataset to 1001 observations. stexpect3 creates grid variables
// (t_exp, expsurv, exphaz) in the dataset; the finest/longest grid requested
// in this file is every(0.05) up to maxt(50), presumably 50/0.05 + 1 = 1001
// points, so the data must have at least that many rows.
set obs 1001
// fix the seed so the randomly spread diagnosis dates (and therefore the
// whole analysis) are reproducible from run to run
set seed 20090101
gen datediag = mdy(1,1,2009) + runiform(1,365)
//stexpect3 using life_tables_1971_2009_england, agediag(age) ///
//   datediag(datediag) pmother(sex) pmage(age) pmyear(calendar_year) every(0.2) maxt(80) pmmaxyear(2009) pmmaxage(99)
//replace exphaz = exphaz*1000

// merge in expected mortality rates
// in simulated data we just used rates for a single calendar year
// keep the age at diagnosis under its own name ...
rename age agediag
// ... because age is now redefined as attained age at exit (capped at 99),
// which is the key used to look up the life table
gen age = min(floor(agediag + _t),99)
// attained year
gen calendar_year = 2009

// padding rows have missing sex, so they stay unmatched (master only)
merge m:1 calendar_year sex age using life_tables_1971_2009_england , ///
          keep(match master)

// calculate expected survival in small intervals up to three years
// agediag() must be given the age at diagnosis (variable agediag), not the
// attained age at exit that is now stored in age; pmage(age) is the name of
// the age variable in the life table file.
// NOTE(review): stexpect3 is a user-written command not shown here - confirm
// that agediag() expects age at the time origin.
stexpect3 using life_tables_1971_2009_england, agediag(agediag) ///
   datediag(datediag) pmother(sex) pmage(age) pmyear(calendar_year) every(0.05) maxt(3) pmmaxyear(2009) pmmaxage(99)
// keep the 3-year grid under distinct names so a later call can reuse the
// default output names
rename t_exp t_exp3
rename expsurv expsurv3
rename  exphaz exphaz3
// express the expected hazard per 1000 person-years
replace exphaz3 = exphaz3*1000
          
// Flexible parametric models (stpm2) with background mortality
// Fitted with 3, 4 and 5 df as an example
foreach k of numlist 3/5 {
    // bhazard(rate) supplies the expected mortality rate for each subject
    stpm2, scale(hazard) df(`k') bhazard(rate)
    estimates store stpm2_`k'

    // model-based predictions evaluated on the t_exp3 grid
    predict RS1_df`k', survival timevar(t_exp3)
    predict eh1_df`k', hazard timevar(t_exp3)
    // rescale the predicted hazard to per 1000 person-years
    replace eh1_df`k' = 1000*eh1_df`k'

    // combine with the expected functions: product of survival functions,
    // sum of hazards
    generate S1_df`k' = expsurv3*RS1_df`k'
    generate h1_df`k' = exphaz3 + eh1_df`k'

    // keep the AIC (5th column of r(S) from estat ic) as formatted text
    // in local AIC_<df>, for use in the plot legends
    estat ic
    local AIC_`k': display %4.3f el(r(S),1,5)
}

// Survival functions on the t_exp3 grid: Kaplan-Meier step function,
// the three model-based curves and the expected survival (dotted);
// each model's legend entry carries its AIC
twoway line S_km _t, sort connect(stairstep) ///
    || line S1_df3 t_exp3, sort ///
    || line S1_df4 t_exp3, sort ///
    || line S1_df5 t_exp3, sort ///
    || line expsurv3 t_exp3, lpattern(dot) lcolor(black) ///
    || , title("Flexible Parametric Survival Models") ///
    ytitle("Survival function") ///
    ylabel(0(0.2)1) ///
    legend(order(1 "K-M" 2 "3 df (AIC=`AIC_3')" 3 "4 df (AIC=`AIC_4')" 4 "5 df (AIC=`AIC_5')" 5 "Expected") pos(7) cols(2) ring(0) size(small)) ///
    name(surv1, replace)
    
// Hazard functions on the t_exp3 grid (per 1000 person-years)
// high initial hazard removed
// NOTE(review): no if-restriction is visible on these plots - confirm how
// the initial hazard is meant to be excluded.
// The first plot is a white dummy K-M line, so the model curves occupy plot
// positions 2-4 as in the survival graph; it is left out of the legend.
// yscale(log) is currently not applied.
twoway line S_km _t, sort connect(stairstep) lcolor(white) ///
    || line h1_df3 t_exp3, sort ///
    || line h1_df4 t_exp3, sort ///
    || line h1_df5 t_exp3, sort ///
    || line exphaz3 t_exp3, lpattern(dot) lcolor(black) ///
    || , title("Flexible Parametric Survival Models (background mortality)") ///
    ytitle("Mortality rate (per 1000 py)") ///
    ylabel(100 200 500 1000,angle(h)) ///
    legend(order(2 "3 df (AIC=`AIC_3')" 3 "4 df (AIC=`AIC_4')" 4 "5 df (AIC=`AIC_5')" 5 "Expected") pos(1) cols(2) ring(0) size(small)) ///
    name(haz1, replace)
    
// Restricted mean survival time (RMST) at three years:
// integrate each model's survival curve over the t_exp3 grid
foreach k of numlist 3/5 {
    quietly integ S1_df`k' t_exp3
    display "RMST at 3 years with `k' df: " %5.3f `r(integral)' " years"
}
  
// Now plot extrapolated curves to 50 years
// Also calculate mean survival
// Expected survival/hazard on a 0.05-year grid out to 50 years.
// agediag() is given the age at diagnosis (variable agediag, renamed from
// age earlier in this file); the variable age now holds attained age at
// exit and is only the life-table lookup key named in pmage().
// NOTE(review): stexpect3 is a user-written command not shown here - confirm
// that agediag() expects age at the time origin.
stexpect3 using life_tables_1971_2009_england, agediag(agediag) ///
   datediag(datediag) pmother(sex) pmage(age) pmyear(calendar_year) every(0.05) maxt(50) pmmaxyear(2009) pmmaxage(99)
// store the 50-year grid under its own names
rename t_exp t_exp50
rename expsurv expsurv50
rename  exphaz exphaz50
// express the expected hazard per 1000 person-years
replace exphaz50 = exphaz50*1000

// Re-use each stored model to predict on the 50-year grid (t_exp50)
foreach k of numlist 3/5 {
    estimates restore stpm2_`k'

    predict RS2_df`k', survival timevar(t_exp50)
    predict eh2_df`k', hazard timevar(t_exp50)
    // rescale the predicted hazard to per 1000 person-years
    replace eh2_df`k' = 1000*eh2_df`k'

    // combine with the expected functions: product of survival functions,
    // sum of hazards
    generate S2_df`k' = expsurv50*RS2_df`k'
    generate h2_df`k' = exphaz50 + eh2_df`k'
}

// Extrapolated survival functions on the t_exp50 grid: Kaplan-Meier step
// function, the three model-based curves and the expected survival (dotted)
twoway line S_km _t, sort connect(stairstep) ///
    || line S2_df3 t_exp50, sort ///
    || line S2_df4 t_exp50, sort ///
    || line S2_df5 t_exp50, sort ///
    || line expsurv50 t_exp50, lpattern(dot) lcolor(black) ///
    || , ytitle("Survival function") ///
    title("Flexible Parametric Survival Models (background mortality)") ///
    legend(order(1 "K-M" 2 "3 df (AIC=`AIC_3')" 3 "4 df (AIC=`AIC_4')" 4 "5 df (AIC=`AIC_5')" 5 "Expected") pos(1) cols(2) ring(0) size(small)) ///
    name(surv2, replace)
				
        
// Extrapolated hazard functions on the t_exp50 grid (per 1000 person-years)
// only plot expected hazard when expected survival is >0.01
// The legend label for the 5 df model uses local AIC_5; the fitting loop
// runs over 3/5 df only, so no AIC_6 macro exists and referencing it would
// print an empty "AIC=" in the legend.
twoway 	(line S_km _t, sort connect(stairstep) color(white)) /// Add a white dummy line for K-M  
		(line h2_df3 t_exp50 , sort) ///
		(line h2_df4 t_exp50 , sort) ///
		(line h2_df5 t_exp50 , sort) ///
		(line exphaz50 t_exp50 if expsurv50>0.01, lpattern(dot) lcolor(black)) ///
		,legend(order(2 "3 df (AIC=`AIC_3')" 3 "4 df (AIC=`AIC_4')" 4 "5 df (AIC=`AIC_5')" 5 "Expected") pos(1) cols(2) ring(0) size(small)) ///
		title("FPM Models: Extrapolated hazard function") ///
		ytitle("Mortality rate (per 1000 person years)") ///
		name(haz2, replace)			

// Mean survival (evaluated at 50 years): integrate each extrapolated
// survival curve over the t_exp50 grid.
// S(t) is effectively zero by 50 years.
foreach k of numlist 3/5 {
    quietly integ S2_df`k' t_exp50
    display "Mean survival with `k' df: " %5.3f `r(integral)' " years"
}
